
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.offline as py
import calmap
from IPython.display import set_matplotlib_formats
from datetime import date, timedelta
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, add_changepoints_to_plot
from scipy.special import inv_boxcox
# Suppress all warnings for the rest of the session.
import warnings
warnings.simplefilter('ignore')

# Chart colours, one per case type.
cnfcol = '#393e46'  # confirmed - dark gray
dthcol = '#ff2e63'  # deaths - red
reccol = '#21bf73'  # recovered - green
actcol = '#fe9801'  # active - orange

# Korean-capable font and retina-resolution figures.
# unicode_minus=False: presumably so minus signs still render with this font.
plt.rcParams.update({
    'font.family': "Malgun Gothic",
    'axes.unicode_minus': False,
})
set_matplotlib_formats('retina')
# Map visualisation
import requests
from bs4 import BeautifulSoup as bs
import json
from pandas.io.json import json_normalize
import os
# Hide code cells.
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di

# jQuery snippet run on page load: when the page has no "body.notebook_app"
# element, toggle the visibility of the code input areas and the prompts.
_hide_code_script = '<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>'
di.display_html(_hide_code_script, raw=True)
# Load the global case table; the 'Date' column is parsed as datetime.
full_table = pd.read_csv('data/covid_19_clean_complete.csv', parse_dates=['Date'])

# Case-count columns used throughout the analysis.
cases = ['Confirmed', 'Deaths', 'Recovered', 'Active']

# Active cases = confirmed - deaths - recovered.
full_table['Active'] = full_table['Confirmed'] - full_table['Deaths'] - full_table['Recovered']

# Replace the 'Mainland China' label with just 'China'.
full_table['Country/Region'] = full_table['Country/Region'].replace('Mainland China', 'China')

# Fill missing values: empty string for the province, 0 for the counts.
full_table[['Province/State']] = full_table[['Province/State']].fillna('')
full_table[cases] = full_table[cases].fillna(0)

# Rows for the most recent date. reset_index() keeps the previous index as an
# 'index' column. row_latest is the same snapshot without China.
latest_date = full_table['Date'].max()
full_latest = full_table[full_table['Date'] == latest_date].reset_index()
row_latest = full_latest[full_latest['Country/Region'] != 'China']

# Per-country totals for the latest date. The columns are selected with a
# list: selecting several columns with a bare tuple after groupby() is no
# longer accepted by pandas 2.0+.
full_latest_grouped = full_latest.groupby('Country/Region')[cases].sum().reset_index()
row_latest_grouped = row_latest.groupby('Country/Region')[cases].sum().reset_index()
# Maximum of each count per (country, province). Columns are selected with a
# list because tuple selection after groupby() fails on pandas 2.0+.
temp = full_table.groupby(['Country/Region', 'Province/State'])[['Confirmed', 'Deaths', 'Recovered', 'Active']].max()
# temp.style.background_gradient(cmap='Reds')

# Totals per date over all rows, then keep only the most recent date.
temp = full_table.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
temp = temp[temp['Date'] == temp['Date'].max()].reset_index(drop=True)
temp.style.background_gradient(cmap='Pastel1')

# Long format (one row per case type) for the treemap of
# active / deaths / recovered.
tm = temp.melt(id_vars="Date", value_vars=['Active', 'Deaths', 'Recovered'])
fig = px.treemap(tm, path=["variable"], values="value", height=400, width=600,
                 color_discrete_sequence=[reccol, actcol, dthcol])
fig.show()
# Short alias for the per-country table (same object, not a copy).
flg = full_latest_grouped

# Top 20 countries by confirmed cases, re-sorted ascending for display.
top_confirmed = flg.sort_values('Confirmed', ascending=False).head(20)
top_confirmed = top_confirmed.sort_values('Confirmed', ascending=True)

# Leave room on the x axis for the labels drawn outside the bars.
confirmed_x_max = max(flg['Confirmed']) + 10000

fig = px.bar(
    top_confirmed,
    x="Confirmed",
    y="Country/Region",
    title='누적 확진자 수',
    text='Confirmed',
    orientation='h',
    width=700,
    height=700,
    range_x=[0, confirmed_x_max],
)
fig.update_traces(marker_color='#46cdcf', opacity=0.8, textposition='outside')
fig.show()
# Per-country table ordered by confirmed cases (descending) with a new index.
temp_f = (
    full_latest_grouped
    .sort_values(by='Confirmed', ascending=False)
    .reset_index(drop=True)
)
temp_f.style.background_gradient(cmap='Reds')

# Countries with at least one death; display the ten with the most deaths.
has_deaths = temp_f['Deaths'] > 0
temp_flg = temp_f[has_deaths][['Country/Region', 'Deaths']]
top_deaths = temp_flg.sort_values('Deaths', ascending=False).reset_index(drop=True).head(10)
top_deaths.style.background_gradient(cmap='Reds')
# Top 10 countries by deaths, re-sorted ascending for display.
top_deaths_bar = flg.sort_values('Deaths', ascending=False).head(10)
top_deaths_bar = top_deaths_bar.sort_values('Deaths', ascending=True)

# Leave room on the x axis for the labels drawn outside the bars.
deaths_x_max = max(flg['Deaths']) + 500

fig = px.bar(
    top_deaths_bar,
    x="Deaths",
    y="Country/Region",
    title='누적 사망자 수',
    text='Deaths',
    orientation='h',
    width=700,
    height=700,
    range_x=[0, deaths_x_max],
)
fig.update_traces(marker_color=dthcol, opacity=0.6, textposition='outside')
fig.show()
# Countries with zero recoveries; display the first ten in temp_f order.
zero_recovered = temp_f['Recovered'] == 0
temp = temp_f[zero_recovered][['Country/Region', 'Confirmed', 'Deaths', 'Recovered']]
temp.head(10).reset_index(drop=True).style.background_gradient(cmap='Reds')

# Rows of row_latest_grouped whose confirmed count equals the death count.
all_died = row_latest_grouped['Confirmed'] == row_latest_grouped['Deaths']
temp = (
    row_latest_grouped[all_died][['Country/Region', 'Confirmed', 'Deaths']]
    .sort_values('Confirmed', ascending=False)
    .reset_index(drop=True)
)
temp.style.background_gradient(cmap='Reds')

# Rows of row_latest_grouped whose confirmed count equals the recovered count.
all_recovered_mask = row_latest_grouped['Confirmed'] == row_latest_grouped['Recovered']
temp = (
    row_latest_grouped[all_recovered_mask][['Country/Region', 'Confirmed', 'Recovered']]
    .sort_values('Confirmed', ascending=False)
    .reset_index(drop=True)
)
temp.style.background_gradient(cmap='Greens')
# Top 10 countries by recovered cases, re-sorted ascending for display.
top_recovered = flg.sort_values('Recovered', ascending=False).head(10)
top_recovered = top_recovered.sort_values('Recovered', ascending=True)
recovered_x_max = max(flg['Recovered']) + 10000
fig = px.bar(
    top_recovered,
    x="Recovered",
    y="Country/Region",
    title='누적 격리해제 수',
    text='Recovered',
    orientation='h',
    width=700,
    height=700,
    range_x=[0, recovered_x_max],
)
fig.update_traces(marker_color=reccol, opacity=0.6, textposition='outside')
fig.show()

# Top 20 countries by active cases, re-sorted ascending for display.
top_active = flg.sort_values('Active', ascending=False).head(20)
top_active = top_active.sort_values('Active', ascending=True)
active_x_max = max(flg['Active']) + 3000
fig = px.bar(
    top_active,
    x="Active",
    y="Country/Region",
    title='현재 환자 수',
    text='Active',
    orientation='h',
    width=700,
    height=700,
    range_x=[0, active_x_max],
)
fig.update_traces(marker_color='#f0134d', opacity=0.6, textposition='outside')
fig.show()
# Mortality rate in percent: deaths / confirmed * 100, rounded to 2 decimals.
# The column is added to flg itself.
death_share = flg['Deaths'] / flg['Confirmed']
flg['Mortality Rate'] = (death_share * 100).round(2)

# Only countries with more than 100 confirmed cases are considered.
over_100 = flg['Confirmed'] > 100
temp = flg[over_100]
temp = temp.sort_values('Mortality Rate', ascending=False)

# The 23 highest rates, re-sorted ascending for display.
top_rates = temp.sort_values('Mortality Rate', ascending=False).head(23)
top_rates = top_rates.sort_values('Mortality Rate', ascending=True)
fig = px.bar(
    top_rates,
    x="Mortality Rate",
    y="Country/Region",
    text='Mortality Rate',
    orientation='h',
    width=700,
    height=600,
    range_x=[0, 10],
    title='치사율',
)
fig.update_traces(marker_color='#00a8cc', opacity=0.6, textposition='outside')
fig.show()
import math

# Death records; the '나이' (age) column is averaged below.
dead = pd.read_csv('data/deaths.csv')

# Take the recorded ages BEFORE the blanket fillna(0): averaging after the
# fill would count every unknown age as 0 and pull the mean down.
known_ages = dead['나이'].dropna()
dead = dead.fillna(0)

# Mean age rounded down; None when no age is recorded at all.
math.floor(np.mean(known_ages)) if len(known_ages) else None
# Confirmed-case time series; keep only the South Korea row.
df_predict = pd.read_csv("data/time_series_19-covid-Confirmed_3.csv")
df_korea = df_predict[df_predict["Country/Region"] == "South Korea"]
if df_korea.empty:
    # Fail here with a clear message instead of later inside Prophet.
    raise ValueError("no 'South Korea' row found in the confirmed-case time series")
df_korea.head()

# Transpose so each original column becomes a row, then skip the first four
# rows (presumably Province/State, Country/Region, Lat, Long - confirm
# against the CSV header).
df_korea = df_korea.T[4:]

# After the transpose the value column is labelled with the row's original
# index. Look the label up instead of hard-coding 33, so the rename still
# works if the row position changes in the CSV.
value_col = df_korea.columns[0]
df_korea = df_korea.reset_index().rename(columns={'index': 'date', value_col: 'confirmed'})
df_korea['date'] = pd.to_datetime(df_korea['date'])
df_korea.head()

# Rename to the 'ds' / 'y' column names passed to Prophet.
df_prophet = df_korea.rename(columns={
    'date': 'ds',
    'confirmed': 'y',
})
df_prophet.head()
# Model settings collected in one place.
prophet_options = dict(
    # Larger values make the model more flexible.
    changepoint_prior_scale=0.2,
    # Fraction of the data, counted from the start, in which changepoints
    # may be placed.
    changepoint_range=0.98,
    yearly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=True,
    # additive: the seasonal effect is added on top of the trend.
    seasonality_mode='additive',
)
m = Prophet(**prophet_options)
m.fit(df_prophet)

# Frame with 7 additional periods appended for the forecast.
future = m.make_future_dataframe(periods=7)
future.tail(7)

forecast = m.predict(future)
# ds: date, yhat: predicted value,
# yhat_lower / yhat_upper: lower / upper prediction allowing for error.
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
forecast[forecast_columns].tail(7)

fig = plot_plotly(m, forecast)
py.iplot(fig)
def _drop_first_five(text):
    """Return *text* without its first five characters."""
    return text[5:]


# Test-vs-confirmed ratio table: shorten the '일자' strings (presumably
# dropping a "YYYY-" prefix - confirm against the CSV) and index by them.
df_check = pd.read_csv("data/검사대비확진비율.csv")
df_check['일자'] = df_check["일자"].astype(str).map(_drop_first_five)
df_check = df_check.set_index("일자")

# Daily confirmed table: same treatment. reset_index() first moves the
# original index into an 'index' column.
df_cum = pd.read_csv("data/일별확진자.csv")
df_cum = df_cum.reset_index()
df_cum['일자'] = df_cum["일자"].astype(str).map(_drop_first_five)
df_cum = df_cum.set_index("일자")

print(df_check.head(5))
print(df_cum.head(5))
# Transpose so each original column becomes a row; the former column labels
# end up in the 'dates' column after reset_index().
df_dates = pd.read_csv('data/covid19_dates.csv')
df_dates = df_dates.T
df_dates = df_dates.reset_index()
df_dates.columns = ['dates', 'variation', 'confirmed']
# Drop the row labelled 0 (presumably a non-date label row - confirm).
df_dates = df_dates.drop(0)

# Point plot of 'confirmed' and bar plot of 'variation' on one figure.
plt.figure(figsize=(20, 8))
plt.xticks(rotation=90)
sns.pointplot(data=df_dates, x='dates', y='confirmed')
g = sns.barplot(data=df_dates, x='dates', y='variation')

# Annotate points above 1000 with their value, placed one slot to the left
# and 450 units above the point.
for i, s in enumerate(df_dates['confirmed']):
    if s > 1000:
        g.text(x=i-1, y=s+450, s=s)
def _plot_twin_axis(frame, title, left_col, right_col):
    """Plot two columns of *frame* on a shared x axis with separate y axes.

    *left_col* is drawn as a solid red line on the left axis and *right_col*
    as a dotted green line on the right axis. Each column name is also used
    as its legend label and y-axis label.

    Returns the ``(fig, ax0, ax1)`` triple so callers can keep customising.
    """
    fig, ax0 = plt.subplots(figsize=(20, 5))
    ax1 = ax0.twinx()
    ax0.set_title(title)
    ax0.plot(frame[left_col], 'r-', label=left_col)
    ax0.set_ylabel(left_col)
    ax0.grid(False)
    ax1.plot(frame[right_col], 'g:', label=right_col)
    ax1.set_ylabel(right_col)
    ax1.grid(False)
    ax0.set_xlabel("날짜")
    fig.legend()
    plt.show()
    return fig, ax0, ax1


# Cumulative vs newly confirmed cases per day.
fig, ax0, ax1 = _plot_twin_axis(df_cum, "일별 확진자추이", "누적확진자", "발생확진자")

# Negative test results vs confirmation rate.
fig, ax0, ax1 = _plot_twin_axis(df_check, "검사대비 확진비율", "음성판정수", "확진률")
# Load the data.
sub = pd.read_csv("./data/서울시 지하철 호선별 역별 유_무임 승하차 인원 정보.csv", encoding='cp949')

# Remove rows whose '사용월' is earlier than 201811, so the trend can be
# compared with the previous December-February period.
before_cutoff = sub['사용월'] < 201811
sub = sub[~before_cutoff]

# Remove the column that is not needed.
sub = sub.drop(columns=['작업일자']).copy()

# Median paid and free boardings per line, largest first.
paid_by_line = sub.groupby(['호선명'])['유임승차인원'].median().sort_values(ascending=False)
paid_by_line.plot.barh(figsize=(15, 7), color='green')
free_by_line = sub.groupby(['호선명'])['무임승차인원'].median().sort_values(ascending=False)
free_by_line.plot.barh(figsize=(15, 7))
# Keep the lines that are in the top 15 by median boardings for BOTH paid
# and free rides; their rows go into line_df.
# df['호선명'].unique()
pay_index = sub.groupby(['호선명'])['유임승차인원'].median().sort_values(ascending=False).head(15).index
# print(pay_index)
nonpay_index = sub.groupby(['호선명'])['무임승차인원'].median().sort_values(ascending=False).head(15).index
# print(nonpay_index)

# Intersection of the two rankings, in paid-ranking order.
line_list = [element for element in pay_index if element in nonpay_index]
print(line_list)
line_df = sub[sub['호선명'].isin(line_list)]
# Keep the stations that are in the top 30 by median boardings for BOTH
# paid and free rides; their rows go into station_df.
# line_df['지하철역'].nunique()
pay_index = line_df.groupby(['지하철역'])['유임승차인원'].median().sort_values(ascending=False).head(30).index
# print(pay_index)
nonpay_index = line_df.groupby(['지하철역'])['무임승차인원'].median().sort_values(ascending=False).head(30).index
# print(nonpay_index)

# Intersection of the two rankings, in paid-ranking order.
station_list = [element for element in pay_index if element in nonpay_index]
print(station_list)
station_df = line_df[line_df['지하철역'].isin(station_list)]

# Median paid and free boardings per selected station, largest first.
station_df.groupby(['지하철역'])['유임승차인원'].median().sort_values(ascending=False).plot.barh(figsize=(20, 7), color='green')
station_df.groupby(['지하철역'])['무임승차인원'].median().sort_values(ascending=False).plot.barh(figsize=(20, 7))
# Combine boardings and the paid/free flag into one long-format table.
# assign() returns a new frame, so the flag is never written onto a slice of
# station_df (the original pattern triggers SettingWithCopyWarning).
pay = (
    station_df[['사용월', '호선명', '지하철역', '유임승차인원']]
    .assign(**{'유임/무임': '유임'})
    .rename(columns={'유임승차인원': '승차인원'})
)
npay = (
    station_df[['사용월', '호선명', '지하철역', '무임승차인원']]
    .assign(**{'유임/무임': '무임'})
    .rename(columns={'무임승차인원': '승차인원'})
)
df_station = pd.concat([pay, npay])
df_station.head()

plt.figure(figsize=(20, 7))
# Red vertical markers at x positions 2.5 and 14.5.
plt.axvline(x=2.5, ymin=0, ymax=1, linewidth=3, color='red')
plt.axvline(x=14.5, ymin=0, ymax=1, linewidth=3, color='red')
# plt.axvline(x=14.8, ymin=0, ymax=1, linewidth=3, color='green')
# NOTE(review): newer seaborn releases spell ci=None as errorbar=None -
# confirm against the installed version before changing.
sns.pointplot(data=df_station, x='사용월', y='승차인원', hue='유임/무임', ci=None)
# Load the data. Every monthly file is reduced to the same four columns.
bus_columns = ['사용일자', '역명', '승차총승객수', '하차총승객수']

# December
bus1912 = pd.read_csv('./data/BUS_STATION_BOARDING_MONTH_201912_1.csv', encoding='cp949')
bus1912_0 = bus1912[bus_columns]
# bus1912_0.tail() # 20191201 ~ 20191231

# January
bus2001 = pd.read_csv('./data/BUS_STATION_BOARDING_MONTH_202001.csv', encoding='cp949')
bus2001_0 = bus2001[bus_columns]
# bus2001_0.tail() # 20200101 ~ 20200131

# February
bus2002 = pd.read_csv('./data/BUS_STATION_BOARDING_MONTH_202002.csv', encoding='cp949')
bus2002_0 = bus2002[bus_columns]
# bus2002_0.tail() # 20200201 ~ 20200229

# March: keep only rows dated after 20200229, ordered by date.
bus2003 = pd.read_csv('./data/서울시 버스노선별 정류장별 승하차 인원 정보.csv', encoding='cp949')
# bus2003.head()
after_february = bus2003['사용일자'] > 20200229
bus2003_0 = bus2003[after_february].sort_values(by='사용일자').reset_index().drop(columns='index')
bus2003_0 = bus2003_0[bus_columns].copy()
# bus2003_0.tail() # 20200301 ~ 20200315

# Stack December through March into one table.
monthly_frames = [bus1912_0, bus2001_0, bus2002_0, bus2003_0]
bus = pd.concat(monthly_frames)
bus.head()
# Keep the stops that are in the top 15 by median count for BOTH boarding
# and alighting; their rows go into station_df.
on_index = bus.groupby(['역명'])['승차총승객수'].median().sort_values(ascending=False).head(15).index
off_index = bus.groupby(['역명'])['하차총승객수'].median().sort_values(ascending=False).head(15).index

# Intersection of the two rankings, in boarding-ranking order.
station_list = [element for element in on_index if element in off_index]
print(station_list)
station_df = bus[bus['역명'].isin(station_list)]

# Heatmap of boardings by date (rows) and stop (columns).
pivot = station_df.pivot_table(index='사용일자', columns='역명', values='승차총승객수')
plt.figure(figsize=(8, 30))
sns.heatmap(data=pivot, annot=False, fmt='.0f', linewidths=5, cmap='Blues')
# Long-format table: one row per (date, stop, boarding-or-alighting).
# assign() returns a new frame, so the flag is never written onto a slice of
# station_df (the original pattern triggers SettingWithCopyWarning).
on = (
    station_df[['사용일자', '역명', '승차총승객수']]
    .assign(승하차여부='승차')
    .rename(columns={'승차총승객수': '승하차인원'})
)
off = (
    station_df[['사용일자', '역명', '하차총승객수']]
    .assign(승하차여부='하차')
    .rename(columns={'하차총승객수': '승하차인원'})
)
df_fin = pd.concat([on, off])

# Daily change in boardings and alightings.
plt.figure(figsize=(20, 7))
plt.xticks(rotation=90)
plt.axvline(x=49, ymin=0, ymax=1, linewidth=3, color='red')  # Jan 19 (COVID-19 outbreak in Korea)
plt.axvline(x=89, ymin=0, ymax=1, linewidth=3, color='green')  # Feb 28 ('social distancing')
# NOTE(review): newer seaborn releases spell ci=None as errorbar=None -
# confirm against the installed version before changing.
sns.pointplot(data=df_fin, x='사용일자', y='승하차인원', hue='승하차여부', ci=None)
korea = pd.read_csv('data/covid19_korea.csv')

# Maximum of each count per region, ordered by cumulative confirmed cases.
# Columns are selected with a list because tuple selection after groupby()
# fails on pandas 2.0+.
group = korea.groupby(['지역'])[['누적확진자', '격리중', '사망자', '격리해제']].max().copy()
group = group.sort_values('누적확진자', ascending=False)
group.style.background_gradient(cmap='Pastel1_r')

# Ten rows with the most deaths.
deaths = korea[['지역', '사망자']]
deaths = deaths.sort_values(by='사망자', ascending=False).head(10)
deaths.style.background_gradient(cmap='Reds')

# Rows where nobody has been released from isolation yet.
no_recovered = korea[korea['격리해제'] == 0]
no_recovered = no_recovered.sort_values('누적확진자', ascending=False)
no_recovered.style.background_gradient(cmap='Reds')

# Rows where the released count equals the cumulative confirmed count.
all_recovered = korea[korea['누적확진자'] == korea['격리해제']]
all_recovered = all_recovered.sort_values('누적확진자', ascending=False)
all_recovered.style.background_gradient(cmap='Greens')
# Confirmed cases per region, largest first, on a log-scaled y axis.
confirmed_by_region = korea[['지역', '누적확진자']].sort_values('누적확진자', ascending=False)
fig = px.bar(
    confirmed_by_region,
    y='누적확진자',
    x='지역',
    color='지역',
    log_y=True,
    template='ggplot2',
    title='지역별 확진자수',
)
fig.show()

# Deaths per region, largest first, on a log-scaled y axis.
deaths_by_region = korea[['지역', '사망자']].sort_values('사망자', ascending=False)
fig = px.bar(
    deaths_by_region,
    y='사망자',
    x='지역',
    color='지역',
    log_y=True,
    template='ggplot2',
    title='지역별 사망자수',
)
fig.show()
from io import StringIO

base_url = 'http://ncov.mohw.go.kr/bdBoardList_Real.do?brdId=1&brdGubun=13&ncvContSeq=&contSeq=&board_id=&gubun='

# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error
# page further down.
response = requests.get(base_url, timeout=10)
response.raise_for_status()

# Collect every <table> element and let pandas parse them. The HTML is
# wrapped in StringIO because passing literal HTML text to read_html is
# deprecated in recent pandas.
soup = bs(response.text, 'html.parser')
table = soup.select('table')
table_html = str(table)
table_list = pd.read_html(StringIO(table_html))
table_df = table_list[0]

# NOTE(review): '방생률' looks like a typo for '발생률'; left unchanged in
# case later cells refer to the column by this name.
table_df.columns = ['시도명', '전일대비확진자증감', '확진자수', '사망자수', '방생률', '일일검사건수']

# Drop the rows labelled 0 and 18 (presumably a total row and a non-region
# row - confirm against the live page) and renumber.
table_df = table_df.drop(index=[0, 18]).reset_index(drop=True)
geo_path = './CTPRVN_201905/TL_SCCO_CTPRVN_WGS84.json'
# Context manager so the file handle is closed after parsing.
with open(geo_path, encoding='utf-8') as geo_file:
    geo_json = json.load(geo_file)
geo_json['features'][0]['properties']['CTP_KOR_NM']

# Names (CTP_KOR_NM) of the first 17 features, as strings.
sido = [str(feature['properties']['CTP_KOR_NM']) for feature in geo_json['features'][:17]]

# NOTE(review): names are attached by row position, which assumes the
# scraped table lists regions in the same order as the GeoJSON - confirm.
table_df['sido'] = sido
# Coordinates table, placed side by side with the scraped table
# (rows aligned on the index).
geo_sido = pd.read_csv('data/전국위경도.csv', encoding='cp949')
df = pd.concat([table_df, geo_sido], axis=1)

# Natural log of the confirmed count, used as the map colour.
df['patient_log'] = np.log(df['확진자수'])
renamed_columns = {'sido_x': '지역', 'patient_log': '확진자수(log)'}
df = df.rename(columns=renamed_columns)

# Regions are matched to GeoJSON features through the 'sido' column and the
# CTP_KOR_NM feature property.
fig = px.choropleth_mapbox(
    df,
    geojson=geo_json,
    color='확진자수(log)',
    locations="sido",
    featureidkey="properties.CTP_KOR_NM",
    center={"lat": 36.093370, "lon": 127.630479},
    mapbox_style="carto-positron",
    zoom=6,
    labels={'확진자수(log)': '코로나19 확진자수(log)'},
)
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()